##########################################
# Replication Data for Proksch, Lowe, Wäckerle, Soroka. (2018). Multilingual Sentiment Analysis: A New Approach to Measuring Conflict in Legislative Speeches. Legislative Studies Quarterly, Forthcoming.
##########################################

#Part 3: Wordscores Replication
#Most of this code follows the replication provided by Herzog and Benoit for the article "The Most Unkindest Cuts: Speaker Selection and Expressed Government Dissent During Economic Crisis"

# Clear every object (hidden ones included) from the global workspace before
# the master file is rebuilt.
rm(list = ls(all.names = TRUE))
library(foreign)
library(reshape2)
library(ggplot2)
library(rstudioapi)

# Resolve all input/output files relative to the folder of this script.
# The active-document lookup only works inside RStudio and returns an empty
# path for an unsaved document; in both cases the current working directory
# is left untouched instead of aborting the script.
if (rstudioapi::isAvailable()) {
    current_path <- getActiveDocumentContext()$path
    if (nzchar(current_path)) {
        setwd(dirname(current_path))
    }
}


### PATHS ############################
# All names are relative to the working directory.

# --- inputs -------------------------------------------------------------
estFile          <- "3_yearspeakerData.RData"                # estimates
speechesDataFile <- "Data_budget_debates_1983-2012.tab"      # original speeches
elecFile         <- "Data_elections_data.csv"                # elections
ministerFile     <- "Data_ministers.csv"                     # minister data
firstDailFile    <- "Data_members_first_Dail.csv"            # members' first Dail
constFile        <- "Data_constituency_data.csv"             # constituencies
constLinkFile    <- "Data_constituency_county_link_file.csv" # constituency-county link
countyPopFile    <- "Data_county_census_population.csv"      # county population
lrFile           <- "Data_live_register_data_1983-2013.csv"  # live register data

# --- output -------------------------------------------------------------
outFileR <- "3_master_data.RData"
########################################


# Load data files
# ---------------
# Restore the objects stored in the estimates image
# (presumably this is where `yearspeakerData`, used further down, comes from).
load(estFile)

# The five CSV inputs are all read as UTF-8 with a header row.
elec            <- read.csv(elecFile,      header = TRUE, encoding = "UTF-8")
const           <- read.csv(constFile,     header = TRUE, encoding = "UTF-8")
lr              <- read.csv(lrFile,        header = TRUE, encoding = "UTF-8")
constCountyLink <- read.csv(constLinkFile, header = TRUE, encoding = "UTF-8")
countyPop       <- read.csv(countyPopFile, header = TRUE, encoding = "UTF-8")

# better column names: every literal dot in a column name becomes an underscore
names(elec)            <- gsub(".", "_", names(elec),            fixed = TRUE)
names(const)           <- gsub(".", "_", names(const),           fixed = TRUE)
names(lr)              <- gsub(".", "_", names(lr),              fixed = TRUE)
names(constCountyLink) <- gsub(".", "_", names(constCountyLink), fixed = TRUE)
names(countyPop)       <- gsub(".", "_", names(countyPop),       fixed = TRUE)


# Create master file with member-year data structure
# --------------------------------------------------
# For every Dail, the members returned in `elec` are repeated once per debate
# year of that Dail. Up to Dail 27 the budget year equals the debate year;
# from Dail 28 on the budget year is the debate year plus one.
# Debate year 1997 is listed for both Dail 27 and Dail 28 on purpose (budget
# years 1997 and 1998 respectively).
dailYears <- data.frame(
    dail        = c(24L,   25L,   26L,   27L,   28L,   29L,   30L,   31L),
    firstDebate = c(1983L, 1987L, 1990L, 1993L, 1997L, 2002L, 2007L, 2011L),
    lastDebate  = c(1986L, 1989L, 1992L, 1997L, 2001L, 2006L, 2010L, 2012L),
    budgetLead  = c(0,     0,     0,     0,     1,     1,     1,     1)
)

# Build all member-year pieces first and bind them once, rather than growing
# `data` with rbind() inside eight separate loops.
memberYears <- lapply(seq_len(nrow(dailYears)), function(k) {
    members <- elec[elec$dail == dailYears$dail[k], ]
    debateYears <- dailYears$firstDebate[k]:dailYears$lastDebate[k]
    perYear <- lapply(debateYears, function(yr) {
        piece <- members
        piece$debate_year <- yr
        piece$budget_year <- yr + dailYears$budgetLead[k]
        piece
    })
    do.call(rbind, perYear)
})
data <- do.call(rbind, memberYears)


# Add estimates
# -------------
# Speaker-years without an estimated textscore are dropped from the estimates
# before merging, so they show up as NA in the master file.
hasScore <- !is.na(yearspeakerData$textscore)
yearspeakerData <- yearspeakerData[hasScore, ]

# Columns of the estimates that are not carried over into the master file.
excl <- c("constID", "partyID", "constituency",
          "position", "position_numeric",
          "department", "department2", "department3", "department4")
keepCols <- !(names(yearspeakerData) %in% excl)

# Keep every member-year (all.x), whether or not an estimate exists for it.
data <- merge(data,
              yearspeakerData[, keepCols],
              by = c("memberID", "budget_year"),
              all.x = TRUE)



# Indicator for TDs who resigned or died in office
# ------------------------------------------------
# Each entry gives the TD's name exactly as spelled in data$name and the first
# budget year from which the seat counts as vacated. A member-year is flagged
# only if it also has no textscore.
vacatedSeats <- list(
    # 24th Dail
    list(td = "Mr. Clement Coughlan",             from = 1983),  # died February 1, 1983
    list(td = "Mr. George Colley",                from = 1984),  # died September 17, 1983
    list(td = "Mr. Bernard F. Cowen",             from = 1984),  # died January 24, 1984
    # 27th Dail
    list(td = "Dr. John F. O'Connell (Deceased)", from = 1994),  # resigned February 1993
    list(td = "Mr. Pádraig Flynn",                from = 1993),  # resigned January 1993 (EU Commissioner)
    list(td = "Mr. Gerry O'Sullivan",             from = 1995),  # died August 5, 1994
    list(td = "Mr. Pat Cox",                      from = 1995),  # resigned 1994 after election as MEP
    list(td = "Mr. Johnny Fox",                   from = 1996),  # died March 17, 1995
    list(td = "Mr. Neil T. Blaney",               from = 1996),  # died November 8, 1995
    list(td = "Mr. Brian (Snr.) Lenihan",         from = 1996),  # died November 1, 1995
    # 28th Dail
    list(td = "Mr. Jim Kemmy",                    from = 1998),  # died September 25, 1997
    list(td = "Mr. Raphael P. (Ray) Burke",       from = 1998),  # resigned 7 October 1997
    list(td = "Mr. Hugh Coveney",                 from = 1999),  # died March 15, 1998
    list(td = "Dr. Pat Upton",                    from = 2000),  # died February 22, 1999
    list(td = "Mr. Michael Ferris",               from = 2001),  # died March 20, 2000
    list(td = "Mrs. Theresa Ahearn",              from = 2001),  # died September 20, 2000
    # 29th Dail
    list(td = "Mr. Charlie McCreevy",             from = 2005),  # resigned October 2004 (EU Commissioner)
    list(td = "Mr. John Bruton",                  from = 2005),  # resigned 24 November 2004 (EU Ambassador to US)
    # 30th Dail
    list(td = "Mr. Seamus Brennan (Deceased)",    from = 2009),  # died July 9, 2008
    list(td = "Mr. Tony Gregory (Deceased)",      from = 2010),  # died January 2, 2009
    list(td = "Mr. Pat (The Cope) Gallagher (Seat vacated 06-06-2009)",
         from = 2010),                                           # elected to the European Parliament, June 2009
    # 31st Dail
    list(td = "Mr. Brian Joseph Lenihan RIP",     from = 2012),  # died June 10, 2011
    # NOTE(review): death on December 21, 2012 but threshold 2012 also covers
    # the budget-2012 debate -- confirm this is intended.
    list(td = "Mr. Shane McEntee RIP",            from = 2012)   # died December 21, 2012
)

data$removed <- 0
for (seat in vacatedSeats) {
    vacant <- data$name == seat$td &
        data$budget_year >= seat$from &
        is.na(data$textscore)
    data$removed[vacant] <- 1
}



# Indicator for TDs who were not elected yet (by-elections)
# ---------------------------------------------------------
# Each entry gives the Dail, the TD's name as spelled in data$name, and the
# last budget year (inclusive) that is flagged as "not elected yet". Only
# member-years without a textscore are flagged.
byElections <- list(
    list(dail = 24, td = "Mr. Cathal Coughlan",          upTo = 1983),  # 13 May 1983
    list(dail = 24, td = "Mr. Brian Cowen",              upTo = 1984),  # 14 June 1984
    list(dail = 27, td = "Mr. Eric Byrne",               upTo = 1994),  # 9 June 1994
    list(dail = 27, td = "Mr. Michael Ring",             upTo = 1994),  # 9 June 1994
    list(dail = 27, td = "Ms. Kathleen Lynch",           upTo = 1994),  # 10 November 1994
    list(dail = 27, td = "Mr. Hugh Coveney",             upTo = 1994),  # 10 November 1994
    list(dail = 27, td = "Ms. Mildred Fox",              upTo = 1995),  # 29 June 1995
    list(dail = 27, td = "Ms. Cecilia Keaveney",         upTo = 1996),  # 2 April 1996
    list(dail = 27, td = "Mr. Brian Joseph Lenihan RIP", upTo = 1996),  # 2 April 1996
    list(dail = 28, td = "Ms. Jan O'Sullivan",           upTo = 1998),  # 11 March 1998
    list(dail = 28, td = "Mr. Sean Ryan",                upTo = 1998),  # 11 March 1998
    list(dail = 28, td = "Mr. Simon Coveney",            upTo = 1998),  # 23 October 1998
    list(dail = 28, td = "Dr. Mary Upton",               upTo = 1999),  # 27 October 1999
    list(dail = 28, td = "Mr. Seamus Healy",             upTo = 2000),  # 22 June 2000
    list(dail = 28, td = "Mr. Tom Hayes",                upTo = 2001),  # 30 June 2001
    list(dail = 29, td = "Ms. Catherine Murphy",         upTo = 2005),  # 11 March 2005
    list(dail = 29, td = "Mr. Shane McEntee RIP",        upTo = 2005),  # 11 March 2005
    list(dail = 30, td = "Mr. George Lee (Resigned)",    upTo = 2009),  # 5 June 2009
    list(dail = 30, td = "Ms. Maureen O'Sullivan",       upTo = 2009),  # 5 June 2009
    list(dail = 30, td = "Mr. Pearse Doherty",           upTo = 2010),  # 25 November 2010
    list(dail = 31, td = "Mr. Patrick Nulty",            upTo = 2011)   # 27 October 2011
)

data$notElectedYet <- 0
for (seat in byElections) {
    pending <- data$dail == seat$dail &
        data$name == seat$td &
        data$budget_year <= seat$upTo &
        is.na(data$textscore)
    data$notElectedYet[pending] <- 1
}


# Add constituency data
# ---------------------
# The constituency name is left out of `const` for this merge; rows are
# matched on election year and constituency ID. Only matching rows are kept.
excl <- c("constituency")
constCols <- names(const)[!(names(const) %in% excl)]
data <- merge(data,
              const[, constCols],
              by = c("election_year", "constID"))


# Add constituency-county link file
# ---------------------------------
# Matches data$constituency against the link file's const_name column.
data <- merge(data,
              constCountyLink,
              by.x = "constituency",
              by.y = "const_name")


# Add live register data (measured at county level)
# -------------------------------------------------
# wide to long format: one row per county and year column
lr <- melt(lr, id.vars = "County")
names(lr) <- c("county", "year", "lr_abs")
# the melted column labels carry a "y" that is stripped to obtain the year
lr$year <- as.numeric(gsub("y", "", as.character(lr$year), fixed = TRUE))

# current-year value, matched on county and debate year (unmatched rows drop out)
data <- merge(data, lr,
              by.x = c("county", "debate_year"),
              by.y = c("county", "year"))

# add previous year's live register value: shifting the year forward by one
# makes the value of year t-1 line up with debate year t
lr.lag <- transform(lr, year = year + 1)
names(lr.lag)[names(lr.lag) == "lr_abs"] <- "lr_abs_lag"

data <- merge(data, lr.lag,
              by.x = c("county", "debate_year"),
              by.y = c("county", "year"),
              all.x = TRUE)


# Add county population
# ---------------------
# South Tipperary and North Tipperary are two different counties, but live
# register data only exist for all of Tipperary, so both are relabelled and
# their populations averaged per census year.
# NOTE(review): this takes the mean, not the sum, of the two halves -- confirm
# that is what the downstream models expect.
countyPop$county <- as.character(countyPop$county)
tipperaryHalves <- c("South Tipperary", "North Tipperary")
countyPop$county[countyPop$county %in% tipperaryHalves] <- "Tipperary"

countyPop <- aggregate(countyPop["population"],
                       by = list(county = countyPop$county,
                                 census_year = countyPop$census_year),
                       FUN = mean)

# long to wide: one column per census year, prefixed with "pop"
countyPop <- dcast(data = countyPop, county ~ census_year, value.var = "population")
names(countyPop) <- c("county", paste0("pop", names(countyPop)[-1]))

# merge to master data
data <- merge(data, countyPop, by = "county")

# code for each county in each debate year the population from the census
# column assigned to that window of debate years
censusWindows <- list(
    list(from = 1983, to = 1983, census = "pop1981"),
    list(from = 1984, to = 1988, census = "pop1986"),
    list(from = 1989, to = 1993, census = "pop1991"),
    list(from = 1994, to = 1999, census = "pop1996"),
    list(from = 2000, to = 2004, census = "pop2002"),
    list(from = 2005, to = 2008, census = "pop2006"),
    list(from = 2009, to = Inf,  census = "pop2011")
)

data$population <- NA
for (win in censusWindows) {
    yearRange <- data$debate_year >= win$from & data$debate_year <= win$to
    data$population[yearRange] <- data[[win$census]][yearRange]
}


# Calculate seniority from date of first speech
# ---------------------------------------------
# add first date of debate: lookup from budget year to the first day of the
# corresponding budget debate (budget years without an entry stay NA)
debateStartByBudget <- c(
    "1983" = "1983-03-31", "1984" = "1984-01-25", "1985" = "1985-01-30",
    "1986" = "1986-01-29", "1987" = "1987-03-31", "1988" = "1988-01-27",
    "1989" = "1989-01-25", "1990" = "1990-01-31", "1991" = "1991-01-30",
    "1992" = "1992-01-29", "1993" = "1993-02-24", "1994" = "1994-01-26",
    "1995" = "1995-02-08", "1996" = "1996-01-23", "1997" = "1997-01-22",
    "1998" = "1997-12-03", "1999" = "1998-12-02", "2000" = "1999-12-01",
    "2001" = "2000-12-06", "2002" = "2001-12-05", "2003" = "2002-12-04",
    "2004" = "2003-12-03", "2005" = "2004-12-01", "2006" = "2005-12-07",
    "2007" = "2006-12-06", "2008" = "2007-12-05", "2009" = "2008-10-14",
    "2010" = "2009-12-09", "2011" = "2010-12-07", "2012" = "2011-12-06",
    "2013" = "2012-12-5"
)

budgetIdx <- match(data$budget_year, as.numeric(names(debateStartByBudget)))
data$debate_dateStart <- as.Date(unname(debateStartByBudget)[budgetIdx], "%Y-%m-%d")

# add first dail to data
firstDail <- read.csv(firstDailFile, header = TRUE, encoding = "UTF-8")

# add date of first Dail session: lookup from the number of a member's first
# Dail to a date (Dail numbers without an entry stay NA)
# NOTE(review): the entry for Dail 20 reads 1973-03-04; the 20th Dail is
# usually dated 14 March 1973 -- confirm against the source used.
dailOpening <- c(
    "4"  = "1923-09-19", "5"  = "1927-06-23", "6"  = "1927-10-11",
    "7"  = "1932-03-09", "8"  = "1933-02-08", "9"  = "1937-07-21",
    "10" = "1938-06-30", "11" = "1943-07-01", "12" = "1944-06-09",
    "13" = "1948-02-18", "14" = "1951-06-13", "15" = "1954-06-02",
    "16" = "1957-03-20", "17" = "1961-10-11", "18" = "1965-04-21",
    "19" = "1969-07-02", "20" = "1973-03-04", "21" = "1977-07-05",
    "22" = "1981-06-30", "23" = "1982-03-09", "24" = "1982-12-14",
    "25" = "1987-03-10", "26" = "1989-06-29", "27" = "1992-12-14",
    "28" = "1997-06-26", "29" = "2002-06-06", "30" = "2007-06-14",
    "31" = "2011-03-09"
)

dailIdx <- match(firstDail$firstDail, as.numeric(names(dailOpening)))
firstDail$dateFirstDail <- as.Date(unname(dailOpening)[dailIdx], "%Y-%m-%d")

# merge to data file (members missing from the first-Dail file drop out)
data <- merge(data, firstDail, by = "memberID")

# calculate seniority in days, weeks and (approximate) years, measured from the
# date of a member's first Dail to the debate date
#
# The debate date used to be read as `data$debate_date`, which only resolves
# through `$` partial matching. The column is now picked explicitly: an exact
# `debate_date` column if one exists, otherwise `debate_dateStart`, which is
# the column the partial match falls back to.
debateDateCol <- if ("debate_date" %in% names(data)) "debate_date" else "debate_dateStart"
debateDate <- data[[debateDateCol]]

data$seniorityDays <- difftime(debateDate, data$dateFirstDail, units = "days")

data$seniorityWeeks <- difftime(debateDate, data$dateFirstDail, units = "weeks")

# weeks are converted with 52.25 weeks per year, as in the original coding
data$seniorityYears <- as.numeric(data$seniorityWeeks) / 52.25


table(data$party, exclude = NULL)
# Party label corrections
# -----------------------
data$party <- as.character(data$party)

# PD dissolved on 20 November 2009.
# Rows of Mary Harney and Noel Grealish labelled "Other" are recoded: up to
# debate year 2008 both count as Progressive Democrats; from 2009 on Harney is
# treated as an FF member and Grealish as an Independent.
upTo2008 <- data$debate_year <= 2008
from2009 <- data$debate_year >= 2009

harneyOther <- data$name == "Ms. Mary Harney" & data$party == "Other"
data$party[harneyOther & upTo2008] <- "Progressive Democrats"
data$party[harneyOther & from2009] <- "FF"

grealishOther <- data$name == "Mr. Noel Grealish" & data$party == "Other"
data$party[grealishOther & upTo2008] <- "Progressive Democrats"
data$party[grealishOther & from2009] <- "Independent"

# DL merged with Labour in 1999
formerDL <- data$party == "Democratic Left" & data$debate_year >= 1999
data$party[formerDL] <- "Labour Party"



# Party abbreviations
# -------------------
# code party of Ceann Comhairle from party_name variable
# - which() keeps NA party labels out of the subscript (a logical subscript
#   containing NA is an error when several values are assigned);
# - as.character() guards against party_name being a factor, in which case the
#   assignment would write the integer level codes instead of the labels.
ceannComhairle <- which(data$party == "Ceann Comhairle")
data$party[ceannComhairle] <- as.character(data$party_name[ceannComhairle])

# party label -> abbreviation; labels without an entry get NA
partyMap <- matrix(c(
    "Democratic Left",                                     "DL",
    "Democratic Socialist Party",                          "DSP",
    "Fianna Fáil",                                         "FF",
    "FF",                                                  "FF",
    "Fine Gael",                                           "FG",
    "Green Party",                                         "GRE",
    "Independent",                                         "Indp",
    "Independent Fianna Fáil",                             "Indp",
    "Other",                                               "Other",
    "People Before Profit Alliance",                       "PBPA",
    "Progressive Democrats",                               "PD",
    "Sinn Féin",                                           "SF",
    "Socialist Party",                                     "SP",
    "Socialist Party 2011",                                "SP",
    "Labour Party",                                        "LAB",
    "The Labour Party",                                    "LAB",
    "The Workers' Party",                                  "WP",
    "Workers and Unemployed Action Group South-Tipperary", "WUAG"
), ncol = 2, byrow = TRUE)

data$partyAbbrev <- partyMap[match(data$party, partyMap[, 1]), 2]

# Code government periods
# -----------------------
# One row per government period: the budget years it covers and the dates on
# which that government started and ended (no end date for the last one).
govtPeriods <- data.frame(
    label       = c("FG-Lab Coalition",
                    "FF Minority Govt",
                    "FF Coal",
                    "FG-Lab-DL Rainbow Coal",
                    "FF-led Boom Years",
                    "FF-Gr Crisis Years",
                    "FG-Lab Crisis Years"),
    firstBudget = c(1983, 1987, 1990, 1995, 1998, 2009, 2012),
    lastBudget  = c(1986, 1989, 1994, 1997, 2008, 2011, 2013),
    started     = c("1982-12-14", "1987-03-10", "1989-07-12", "1994-12-15",
                    "1997-06-26", "2008-05-07", "2011-03-09"),
    ended       = c("1987-03-10", "1989-07-12", "1994-12-15", "1997-06-26",
                    "2008-05-07", "2011-03-09", NA),
    stringsAsFactors = FALSE
)

# numeric period code 1..7 by budget year (0 = not covered by any period)
data$period <- 0
for (p in seq_len(nrow(govtPeriods))) {
    budgetYears <- govtPeriods$firstBudget[p]:govtPeriods$lastBudget[p]
    data$period[data$budget_year %in% budgetYears] <- p
}
data$period <- factor(data$period, labels = govtPeriods$label)
with(data, table(period, budget_year))  # verify


# actual government start and end dates (kept as character strings)
periodIdx <- match(as.character(data$period), govtPeriods$label)
data$govtStart <- govtPeriods$started[periodIdx]
data$govtEnd <- govtPeriods$ended[periodIdx]


# Code economic periods
# ---------------------
# Budget years 1983-2008 are coded 1 ("Pre-crisis"), 2009-2013 are coded 2
# ("Crisis"); any other budget year would be coded 0.
econCode <- ifelse(data$budget_year %in% 2009:2013, 2,
                   ifelse(data$budget_year %in% 1983:2008, 1, 0))
data$periodEcon <- factor(econCode, labels = c("Pre-crisis", "Crisis"))
with(data, table(budget_year, periodEcon))  # verify


# Code parties in government and opposition
# -----------------------------------------
# Each entry pairs a time window (logical mask over the rows of `data`) with
# the party abbreviations in government during that window.
debYr <- data$debate_year
budYr <- data$budget_year

cabinets <- list(
    # 1983 - 1986: Fine Gael + Labour
    list(window = debYr >= 1983 & debYr <= 1986, parties = c("LAB", "FG")),
    # 1987 - 1989: Fianna Fail
    list(window = debYr >= 1987 & debYr <= 1989, parties = c("FF")),
    # 1990 - 1992: Fianna Fail + Progressive Democrats
    list(window = debYr >= 1990 & debYr <= 1992, parties = c("FF", "PD")),
    # 1993 - 1994: Fianna Fail + Labour
    list(window = debYr >= 1993 & debYr <= 1994, parties = c("FF", "LAB")),
    # 1995 - 1997: Fine Gael + Labour + Democratic Left
    # (use budget year for 1997 because there were two debates in 1997)
    list(window = debYr >= 1995 & budYr <= 1997, parties = c("FG", "LAB", "DL")),
    # 1998 - 2006: Fianna Fail + Progressive Democrats
    list(window = budYr >= 1998 & debYr <= 2006, parties = c("FF", "PD")),
    # 2007 - 2010: Fianna Fail + Green + Progressive Democrats
    list(window = debYr >= 2007 & debYr <= 2010, parties = c("FF", "PD", "GRE")),
    # 2011 - 2013: Fine Gael + Labour
    list(window = debYr >= 2011 & debYr <= 2013, parties = c("FG", "LAB"))
)

# everyone starts as opposition (0); members of a governing party are set to 1
data$govt <- 0
for (cab in cabinets) {
    data$govt[cab$window & data$partyAbbrev %in% cab$parties] <- 1
}


# convert to factor
data$govt <- factor(data$govt, labels = c("Opposition", "Government"))



# Generate cabinet member variable
# --------------------------------
# read minister data (argument name spelled out rather than partially matched)
ministers <- read.csv(ministerFile, stringsAsFactors = FALSE, encoding = "UTF-8")

# replace missing values in end date with placeholder date
ministers$end_date[ministers$end_date == "NULL"] <- "2014-01-01"

# positions ordered from highest to lowest rank
positionRank <- c("Taoiseach", "Tánaiste", "Minister", "Minister of State")

# only keep ministers in office at time of budget debate and reduce to unique cases
# (some ministers have multiple positions and hence appear multiple times in the data set)
# One piece per budget year is built and all pieces are bound once at the end,
# instead of growing the data frame with rbind() inside a loop.
ministersByYear <- lapply(unique(data$budget_year), function(yr) {
    debate_date <- unique(data$debate_dateStart[data$budget_year == yr])

    # reduce to ministers in office; the character dates are compared against
    # the Date-class debate date. which() drops comparisons that evaluate to
    # NA, so missing dates cannot introduce all-NA rows.
    inOffice <- which((ministers$start_date <= debate_date) &
                      (ministers$end_date > debate_date))
    m <- ministers[inOffice, ]

    # delete duplicates, making sure to keep a member's highest position
    m$position <- factor(m$position, levels = positionRank)
    m <- m[order(m$memberID, m$position), ]
    m <- m[!duplicated(m$memberID), ]

    # add budget year (rep() keeps this valid when no minister matched)
    m$budget_year <- rep(yr, nrow(m))
    m
})
ministersUnique <- do.call(rbind, ministersByYear)

# merge with main data set
data <- merge(data,
              ministersUnique[, c("budget_year", "memberID", "position", "department")],
              by = c("memberID", "budget_year"),
              all.x = TRUE)

table(ministersUnique$position)
# Add govt backbencher and opposition speaker to position column
# --------------------------------------------------------------
# Member-years without a ministerial position are labelled by their side of
# the house; cabMember is 1 for any of the four ministerial positions.
data$position <- as.character(data$position)

noOffice <- is.na(data$position)
data$position[noOffice & data$govt == "Government"] <- "Govt backbencher"
data$position[noOffice & data$govt == "Opposition"] <- "Opposition"

ministerialPosts <- c("Taoiseach", "Tánaiste", "Minister", "Minister of State")
data$cabMember <- as.numeric(data$position %in% ministerialPosts)
data$position <- factor(data$position)


# Calculate number of days on which debate was held from original speeches file
# -----------------------------------------------------------------------------
speechesData <- read.delim(speechesDataFile, stringsAsFactors = FALSE)

# reduce to unique dates: keep the first row seen for every distinct date
firstOfDate <- !duplicated(speechesData$date)
speechesData <- speechesData[firstOfDate, ]

# aggregate by year: the number of remaining rows per year is the number of
# distinct debate dates in that year
daysData <- aggregate(list(debate.days = speechesData$year),
                      by = list(debate_year = speechesData$year),
                      FUN = length)

# add to main data frame (debate years missing from the speeches file drop out)
data <- merge(data, daysData, by = "debate_year")


# Code party leaders
# ------------------
# Each rule names a party abbreviation, the leader's name as spelled in
# data$name, and optional inclusive bounds on the debate year and/or the
# budget year. A bound that is not given is not checked.
leaderRules <- list(
    # Democratic Left
    list(party = "DL",   name = "Mr. Proinsias De Rossa"),

    # Democratic Socialist Party
    list(party = "DSP",  name = "Mr. Jim Kemmy"),

    # Fianna Fail
    list(party = "FF",   name = "Mr. Charles J. Haughey", debateTo = 1992),
    list(party = "FF",   name = "Mr. Albert Reynolds",    debateFrom = 1993, debateTo = 1994),
    list(party = "FF",   name = "Mr. Bertie Ahern",       debateFrom = 1995, debateTo = 2007),
    list(party = "FF",   name = "Mr. Brian Cowen",        debateFrom = 2008, debateTo = 2010),
    list(party = "FF",   name = "Mr. Micheál Martin",     debateFrom = 2011),

    # Fine Gael
    list(party = "FG",   name = "Mr. Garret FitzGerald",  debateTo = 1986),
    list(party = "FG",   name = "Mr. Alan Dukes",         debateFrom = 1987, debateTo = 1990),
    list(party = "FG",   name = "Mr. John Bruton",        debateFrom = 1991, debateTo = 2000),
    list(party = "FG",   name = "Mr. Michael Noonan",     debateFrom = 2001, debateTo = 2001),
    list(party = "FG",   name = "Mr. Enda Kenny",         debateFrom = 2002),

    # Greens
    # no official leader before 2001
    list(party = "GRE",  name = "Mr. Trevor Sargent",     debateFrom = 2001, debateTo = 2006),
    list(party = "GRE",  name = "Mr. John Gormley",       debateFrom = 2007, debateTo = 2010),
    list(party = "GRE",  name = "Mr. Eamon Ryan",         debateFrom = 2011),

    # Labour (the 1997/1998 handover is keyed on the budget year)
    list(party = "LAB",  name = "Mr. Dick Spring",        budgetTo = 1997),
    list(party = "LAB",  name = "Mr. Ruairí Quinn",       budgetFrom = 1998, debateTo = 2001),
    list(party = "LAB",  name = "Mr. Pat Rabbitte",       debateFrom = 2002, debateTo = 2006),
    list(party = "LAB",  name = "Mr. Eamon Gilmore",      debateFrom = 2007),

    # People Before Profit Alliance
    # Collective Leadership

    # Progressive Democrats
    list(party = "PD",   name = "Mr. Desmond J. O'Malley", debateTo = 1993),
    list(party = "PD",   name = "Ms. Mary Harney",         debateFrom = 1994, debateTo = 2005),
    list(party = "PD",   name = "Mr. Michael McDowell",    debateFrom = 2006, debateTo = 2006),
    # replaced by Ciarán Cannon, but who was from the Seanad
    list(party = "PD",   name = "Ms. Mary Harney",         debateFrom = 2007),

    # Sinn Fein
    # (Ó Caoláin: leader of the parliamentary group)
    list(party = "SF",   name = "Mr. Caoimhghín Ó Caoláin", debateTo = 2010),
    list(party = "SF",   name = "Mr. Gerry Adams",          debateFrom = 2011),

    # Socialist Party
    # Collective Leadership

    # Workers' Party
    list(party = "WP",   name = "Mr. Tomás Mac Giolla",   debateTo = 1988),
    list(party = "WP",   name = "Mr. Proinsias De Rossa", debateFrom = 1989),

    # Workers and Unemployed Action Group South-Tipperary
    list(party = "WUAG", name = "Mr. Seamus Healy")
)

# TRUE where `years` lies within the given bounds; a NULL bound is skipped
withinBounds <- function(years, from, to) {
    ok <- rep(TRUE, length(years))
    if (!is.null(from)) ok <- ok & years >= from
    if (!is.null(to)) ok <- ok & years <= to
    ok
}

data$party.leader <- 0
for (rule in leaderRules) {
    isLeader <- data$partyAbbrev == rule[["party"]] &
        withinBounds(data$debate_year, rule[["debateFrom"]], rule[["debateTo"]]) &
        withinBounds(data$budget_year, rule[["budgetFrom"]], rule[["budgetTo"]]) &
        data$name == rule[["name"]]
    data$party.leader[isLeader] <- 1
}


# Misc
# ----
# encode character variables: strip the thousands separators from the two
# count columns and convert them to numeric
for (countCol in c("first_preference_votes", "quota")) {
    withoutCommas <- gsub(",", "", data[[countCol]], fixed = TRUE)
    data[[countCol]] <- as.numeric(withoutCommas)
}


# code empty strings to NA and encode
blankFirstName <- data$first_name == ""
data$first_name[blankFirstName] <- NA
data$first_name <- iconv(data$first_name)


# Save data
# ---------
# R image containing only the master data frame `data`
save(list = "data", file = outFileR)

